/****************************************************************
 *								*
 *	Copyright 2001, 2008 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_string.h"

#include "gdsroot.h"
#include "gdskill.h"
#include "gdsblk.h"
#include "gdsbt.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsfhead.h"
#include "gdscc.h"
#include "filestruct.h"
#include "copy.h"
#include "jnl.h"
#include "hashtab_int4.h"	/* needed for tp.h */
#include "buddy_list.h"		/* needed for tp.h */
#include "tp.h"
#include "t_write.h"
#include "min_max.h"
#include "jnl_get_checksum.h"

GBLREF	cw_set_element	cw_set[];
GBLREF	unsigned char	cw_set_depth;
GBLREF	sgmnt_addrs	*cs_addrs;
GBLREF	sgm_info	*sgm_info_ptr;
GBLREF	short		dollar_tlevel;
GBLREF	trans_num	local_tn;	/* transaction number for THIS PROCESS */
GBLREF	gv_namehead	*gv_target;
GBLREF	uint4		t_err;
GBLREF	unsigned int	t_tries;
GBLREF	boolean_t	horiz_growth;
GBLREF	int4		prev_first_off, prev_next_off;
GBLREF	boolean_t	mu_reorg_process;

/* t_write : record in the create/write set of the current transaction that block "blkhist->blk_num" is being written.
 *
 * Non-TP (dollar_tlevel == 0):
 *	The next free slot of cw_set[] is used and cw_set_depth is incremented.
 * TP (dollar_tlevel != 0):
 *	The cw-set-element already associated with the block is looked up, either through the off_chain index (when the
 *	block number is really a chain reference, i.e. chain.flag == 1) or through the sgm_info_ptr->blks_in_use hash table.
 *	If there is none, a new element is obtained with tp_cw_list(). If there is one but it belongs to a different
 *	(asserted to be lower) $TLEVEL than the current one, a copy is allocated from sgm_info_ptr->tlvl_cw_set_list and
 *	linked to the older element through low_tlevel/high_tlevel ("horizontal growth").
 *
 * Return value:
 *	NULL in the non-TP case; in the TP case the cw-set-element that now describes this update.
 *
 * Side effects visible outside this routine:
 *	- the global "horiz_growth" is set to TRUE if a horizontal copy was created, FALSE otherwise;
 *	- the globals "prev_first_off" and "prev_next_off" are reset to PREV_OFF_INVALID;
 *	- blkhist->cse is set to the cw-set-element so t_end/tp_tend can tell the block is part of the write-set.
 */
cw_set_element *t_write (
			srch_blk_status	*blkhist,	/* Search History of the block to be written. Currently the
							 *	following members in this structure are used by "t_write"
							 *	    "blk_num"		--> Block number being modified
							 *	    "buffaddr"		--> Address of before image of the block
							 *	    "cr->ondsk_blkver"	--> Actual block version on disk
							 */
			unsigned char 	*upd_addr,	/* Address of the update array that contains the changes for this block */
			block_offset 	ins_off,	/* Offset to the position in the buffer that is to receive
							 * 	a block number when one is created. */
			block_index 	index,		/* Index into the create/write set.  The specified entry is
							 * 	always a create entry. When the create gets assigned a
							 * 	block number, the block number is inserted into this
							 * 	buffer at the location specified by ins_off. */
			char		level,		/* Level of the block in the tree */
			boolean_t	first_copy,	/* Is first copy needed if overlaying same buffer? */
			boolean_t	forward,	/* Is forward processing required? */
			uint4		write_type)	/* Whether "killtn" of the bt needs to be simultaneously updated or not */
{
	cw_set_element		*cse, *tp_cse, *old_cse;
	off_chain		chain;
	uint4			iter;
	srch_blk_status		*tp_srch_status;
	ht_ent_int4		*tabent;
	block_id		blk;
	cache_rec_ptr_t		cr;
	boolean_t		new_cse;	/* TRUE if we had to create a new cse for the input block */
	jnl_buffer_ptr_t	jbbp;		/* jbbp is non-NULL only if before-image journaling */
	sgmnt_addrs		*csa;
	blk_hdr_ptr_t		old_block;
	unsigned int		bsiz;

	csa = cs_addrs;
	horiz_growth = FALSE;

	/* When the following two asserts trip, we should change the data types of prev_first_off
	 * and prev_next_off, so they satisfy the assert.
	 */
	assert(sizeof(prev_first_off) >= sizeof(block_offset));
	assert(sizeof(prev_next_off) >= sizeof(block_offset));

	blk = blkhist->blk_num;
	if (dollar_tlevel == 0)
	{	/* Non-TP : the write-set is the global array cw_set[], indexed by cw_set_depth */
		if (blk >= csa->ti->total_blks)
			GTMASSERT;
		cse = &cw_set[cw_set_depth];
		cse->mode = gds_t_noop;	/* initialize it to a value that is not "gds_t_committed" before incrementing
					 * cw_set_depth as secshr_db_clnup relies on it */
		cw_set_depth++;
		assert(cw_set_depth < CDB_CW_SET_SIZE);
		assert(index < (int)cw_set_depth);
		new_cse = TRUE;
		tp_cse = NULL; /* don't bother returning tp_cse for non-TP; it's almost never needed and it distinguishes the cases */
	} else
	{	/* TP : the write-set is kept per segment in sgm_info_ptr */
		assert(!index || index < sgm_info_ptr->cw_set_depth);
		/* View the block number as an off_chain; flag == 1 means it is not a real block number but a reference
		 * (cw_index) to an element of the TP cw-set list, from which the block number is then taken.
		 */
		chain = *(off_chain *)&blk;
		if (chain.flag == 1)
		{
			tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain.cw_index, &cse);
			blk = cse->blk;
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				tp_srch_status = (srch_blk_status *)tabent->value;
			else
				tp_srch_status = NULL;
			cse = tp_srch_status ? tp_srch_status->cse : NULL;
				/* tp_srch_status->cse always returns latest in the horizontal list */
		}
		assert(!cse || !cse->high_tlevel);
		if (cse == NULL)
		{	/* First update to this block in this TP transaction : get a brand new cw-set-element */
			tp_cw_list(&cse);
			sgm_info_ptr->cw_set_depth++;
			assert(gv_target);
			cse->blk_target = gv_target;
			gv_target->write_local_tn = local_tn;
			new_cse = TRUE;
		} else
		{	/* The block already has a cw-set-element in this TP transaction */
			new_cse = FALSE;
			assert(cse->done);
			assert(dollar_tlevel >= cse->t_level);
			if (cse->t_level != dollar_tlevel)
			{
				/* this part of the code is similar to that in gvcst_delete_blk(),
				 * any changes in one should be reflected in the other */
				horiz_growth = TRUE;
				old_cse = cse;
				cse = (cw_set_element *)get_new_free_element(sgm_info_ptr->tlvl_cw_set_list);
				/* Start the new element as an exact copy of the older one, then fix up the fields
				 * that must differ : the horizontal links, t_level, the undo arrays and new_buff.
				 */
				memcpy(cse, old_cse, sizeof(cw_set_element));
				cse->low_tlevel = old_cse;
				cse->high_tlevel = NULL;
				old_cse->high_tlevel = cse;
				cse->t_level = dollar_tlevel;
				assert(2 == (sizeof(cse->undo_offset) / sizeof(cse->undo_offset[0])));
				assert(2 == (sizeof(cse->undo_next_off) / sizeof(cse->undo_next_off[0])));
				for (iter = 0; iter < 2; iter++)
					cse->undo_next_off[iter] = cse->undo_offset[iter] = 0;
				assert(old_cse->new_buff);
				assert(old_cse->done);
				cse->new_buff = NULL;
				/* If the globals hold saved offsets (i.e. are not PREV_OFF_INVALID), put them back
				 * into the older element's first_off/next_off.
				 */
				if (PREV_OFF_INVALID != prev_first_off)
					old_cse->first_off = prev_first_off;
				if (PREV_OFF_INVALID != prev_next_off)
					old_cse->next_off = prev_next_off;
			}
			/* cse->mode can be kill_t_create or kill_t_write only if we have a restartable situation.
			 * this is because a TP transaction should never try modifying a block that is no longer visible in the
			 * tree. the only exception is if due to concurrency issues, we read a stale copy of a buffer that
			 * incorrectly led us to this child block number. this is a restartable situation.
			 * since this routine does not return a failure code, we continue and expect tp_tend to detect this.
			 */
			switch (cse->mode)
			{
				case kill_t_create:
					assert(CDB_STAGNATE > t_tries);
					cse->mode = gds_t_create;
					break;
				case kill_t_write:
					assert(CDB_STAGNATE > t_tries);
					cse->mode = gds_t_write;
					break;
				default:
					;
			}
		}
		tp_cse = cse;
	}
	if (new_cse)
	{	/* Fill in the fields that are set only once, when the cw-set-element is first created */
		cse->blk_checksum = 0;
		cse->blk = blk;
		cse->mode = gds_t_write;
		cse->new_buff = NULL;
		cse->old_block = blkhist->buffaddr;
		old_block = (blk_hdr_ptr_t)cse->old_block;
		assert(NULL != old_block);
		jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
		if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
		{	/* Pre-compute CHECKSUM. Since we don't necessarily hold crit at this point, ensure we never try to
			 * access the buffer more than the db blk_size.
			 * The size in the block header is copied into a local exactly once before it is clamped. MIN is a
			 * macro and may evaluate its arguments more than once; applied directly to old_block->bsiz it
			 * could compare one value of the field and then hand a different (presumably concurrently
			 * modified, possibly larger) value to jnl_get_checksum, defeating the bound.
			 */
			bsiz = old_block->bsiz;
			bsiz = MIN(bsiz, csa->hdr->blk_size);
			cse->blk_checksum = jnl_get_checksum((uint4*)old_block, bsiz);
		}
		/* the buffer in shared memory holding the GDS block contents currently does not have in its block header the
		 * on-disk format of that block. if it had, we could have easily copied that over to the cw-set-element.
		 * until then, we have to use the cache-record's field "ondsk_blkver". but the cache-record is available only in BG.
		 * thankfully, in MM, we do not allow GDSV4 type blocks, so we can safely assign GDSV5 (or GDSVCURR) to this field.
		 */
		cr = blkhist->cr;
		assert((NULL != cr) || (dba_mm == csa->hdr->acc_meth));
		cse->ondsk_blkver = (NULL == cr) ? (enum db_ver)GDSVCURR : cr->ondsk_blkver;
	} else
	{	/* we did not create a new cse. assert the integrity of few fields filled in when this cse was created */
		assert(cse->blk == blk);
		assert(0 == cse->reference_cnt);
		/* If we did not create a new cse, check that the level already stored in the cse is the same as the input level.
		 * It is possible that they are different but that would mean we are in one of two situations
		 *	1) A restartable situation. Since this routine does not currently return a failure code,
		 *		we do not restart here but instead wait for some other failure-code-returning-function
		 *		(if nothing else, the function tp_tend) to catch this situation and trigger a restart.
		 *	2) This block number is the root block of a GVT or Directory Tree and the height of the tree
		 *		is increasing now. In either case cse->blk_target points to the gv_target for that tree.
		 *		The only exception to this is if the global's root is being created.
		 */
		assert(cse->level == level || (CDB_STAGNATE > t_tries) || gds_t_create == cse->mode
			|| cse->blk_target->root == cse->blk);
	}
	/* Fields below are (re)set on every call, whether the cse is new, reused or a horizontal copy */
	cse->upd_addr = upd_addr;
	cse->ins_off = ins_off;
	cse->index = index;
	cse->reference_cnt = 0;
	cse->level = level;
	if (horiz_growth)
		cse->first_copy = TRUE;	/* forced on horizontal growth regardless of what the caller asked for */
	else
		cse->first_copy = first_copy;
	cse->done = FALSE;
	cse->forward_process = forward;
	cse->jnl_freeaddr = 0;		/* reset jnl_freeaddr that previous transaction might have filled in */
	cse->t_level = dollar_tlevel;
	/* All REORG operations should disable the "indexmod" optimization (C9B11-001813/C9H12-002934). Assert that. */
	assert(!mu_reorg_process || (GDS_WRITE_KILLTN == write_type));
	/* In TP the same cse can be written several times in one transaction, so the write_type bits accumulate.
	 * NOTE(review): for a brand new TP cse this relies on write_type being clear in the element handed out by
	 * tp_cw_list() -- confirm. In non-TP the cw_set[] slot is reused across transactions, so it is overwritten.
	 */
	if (dollar_tlevel)
		cse->write_type |= write_type;
	else
		cse->write_type = write_type;
	prev_first_off = prev_next_off = PREV_OFF_INVALID;
	blkhist->cse = cse;	/* indicate to t_end/tp_tend that this block is part of the write-set */
	return tp_cse;
}
